All group members participated in both assignments and, after discussion, produced this report.
Please see the resulting picture below:
# Load the SENIC data set (the file has no header row) and name its columns
senic <- setNames(
  read.table(file = "SENIC.txt", header = FALSE),
  c(
    "ID",
    "Length of Stay",
    "Age",
    "Infection Risk",
    "Routine Culturing Ratio",
    "Routine Chest X-ray Ratio",
    "Number of Beds",
    "Medical School Affiliation",
    "Region",
    "Average Daily Census",
    "Number of Nurses",
    "Available Facilities & Services"
  )
)
# Locate outliers in a numeric vector using the 1.5 * IQR rule.
#
# input_vector: numeric vector. NA entries are ignored when the quartiles are
#   computed and are never reported as outliers.
# Returns: integer vector with the positions (in input_vector) of every value
#   above Q3 + 1.5 * IQR or below Q1 - 1.5 * IQR; integer(0) if there are none.
Q22_function <- function(input_vector) {
  # na.rm = TRUE: quantile() would otherwise stop on any missing value
  quartiles <- quantile(
    input_vector,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  threshold <- 1.5 * (quartiles[2] - quartiles[1])
  # which() drops NA comparisons, so missing values are never flagged
  which(
    input_vector > (quartiles[2] + threshold) |
      input_vector < (quartiles[1] - threshold)
  )
}
# Rows of senic whose infection risk is flagged by Q22_function
Infection_risk_outliers_indecies <- Q22_function(senic[["Infection Risk"]])
outlier_data <- senic[Infection_risk_outliers_indecies, ]

# Density curve of infection risk, with the flagged rows drawn in blue at y = 0
plot23 <- ggplot(data = senic) +
  geom_density(mapping = aes(x = `Infection Risk`)) +
  geom_point(
    data = outlier_data,
    mapping = aes(x = `Infection Risk`),
    y = 0,
    shape = 5,
    col = "blue"
  )
plot23
By observing the graph, it can be noticed that the density curve falls off more steeply on the right-hand side than on the left-hand side. As for the outliers, three of them are located at higher values and the other two are at lower values.
# Names of the quantitative columns of senic that are plotted below
quantitative_variables <- c(
  "Length of Stay",
  "Age",
  "Infection Risk",
  "Routine Culturing Ratio",
  "Routine Chest X-ray Ratio",
  "Number of Beds",
  "Average Daily Census",
  "Number of Nurses",
  "Available Facilities & Services"
)
# Build the density plot of one senic column, with the values flagged by
# Q22_function overlaid as blue points at y = 0.
#
# name: name of a column of senic (character scalar).
# Returns: a ggplot object.
#
# NOTE(review): this definition masks the base plot() generic for top-level
# code; the name is kept because the code below calls it.
plot <- function(name) {
  # Keep the flagged rows as a data frame so both layers map the same column
  outlier_rows <- senic[Q22_function(senic[[name]]), , drop = FALSE]

  # .data[[name]] selects the column by its string name inside aes()
  density_plot <- ggplot(data = senic, mapping = aes(x = .data[[name]])) +
    geom_density() +
    xlab(name)

  # Skip the point layer when nothing is flagged so the plot still works
  if (nrow(outlier_rows) > 0) {
    density_plot <- density_plot +
      geom_point(data = outlier_rows, y = 0, shape = 5, col = "blue")
  }
  density_plot
}
# One density plot per quantitative variable, arranged in a single grid
plot_list <- lapply(X = quantitative_variables, FUN = plot)
grid_plot <- grid.arrange(
  grobs = plot_list,
  top = "Density Plot of All Quantitative Variables"
)
By observing the above graph, it can be noticed that “Length of
Stay”, “Routine Culturing Ratio”, “Number of Beds”, “Average Daily Census”
and “Number of Nurses” are right-skewed, and their outliers appear at
higher values.
On the other hand, “Age”, “Infection Risk”, “Routine Chest X-ray Ratio”
and “Available Facilities & Services” are close to symmetric, and
“Available Facilities & Services” does not contain any outliers.
# Scatter plot of number of nurses against infection risk, coloured by beds
plotQ2_5 <- ggplot(
  data = senic,
  mapping = aes(
    x = `Infection Risk`,
    y = `Number of Nurses`,
    color = `Number of Beds`
  )
) +
  geom_point()
print(plotQ2_5)
Compared to the graph in step 4, the above graph lets one
investigate the correlation between two variables (in this case,
Infection Risk and Number of Nurses).
There are a few potential problems with the color scale.
First, the different shades of blue may not be easily distinguishable by
the human eye. Second, while adding a color scale is a way to add another
dimension to the graph, the relationship it represents might not be
intuitive for the reader.
# Convert the ggplot2 density plot into an interactive plotly graph
ggplotly(p = plot23)
As the above graph shows, a graph made with ggplot2 can be converted directly into a plotly graph. Unlike the previous graph, the plotly version is interactive: the toolbar buttons can be used to control the graph, and hovering the cursor over the graph displays the value at that point.
# Infection-risk values flagged by Q22_function
outliers <- senic[["Infection Risk"]][Q22_function(senic[["Infection Risk"]])]

# Histogram of infection risk, with the flagged values overlaid as diamond
# markers at y = 0
figure <- senic %>%
  mutate(ifoutliers = `Infection Risk` %in% outliers) %>%
  plot_ly(x = ~`Infection Risk`, type = "histogram") %>%
  add_trace(
    x = ~`Infection Risk`[ifoutliers],
    y = 0,
    type = "scatter",
    mode = "markers",
    marker = list(symbol = "diamond")
  ) %>%
  layout(title = "The histogram of Infection Risk and its outliers")
figure
# Shiny UI: variable check boxes, a bandwidth slider and the plot area
Q28ui <- fluidPage(
  checkboxGroupInput(
    inputId = "variables",
    label = "Choose Variables",
    choices = quantitative_variables
  ),
  sliderInput(
    inputId = "bw",
    label = "Choose bandwidth size",
    min = 0.02,
    max = 10.0,
    value = 0.2
  ),
  plotOutput(outputId = "densPlot")
)
# Shiny server: draws one density plot per selected variable, using the
# bandwidth chosen on the slider, with the values flagged by Q22_function
# overlaid as blue points at y = 0.
#
# input:  reads input$variables (selected column names) and input$bw.
# output: output$densPlot receives the rendered grid of plots.
Q28server <- function(input, output) {
  output$densPlot <- renderPlot({
    # Show a prompt instead of an error while no variable is selected
    validate(need(input$variables, "Please select variables."))
    # Read the slider once per render rather than once per plot
    bandwidth <- input$bw

    # Density plot of a single senic column with its flagged rows overlaid
    plotQ28 <- function(name) {
      outlier_rows <- senic[Q22_function(senic[[name]]), , drop = FALSE]

      # .data[[name]] selects the column by its string name inside aes()
      density_plot <- ggplot(data = senic, mapping = aes(x = .data[[name]])) +
        geom_density(bw = bandwidth) +
        xlab(name)

      # Skip the point layer when nothing is flagged
      if (nrow(outlier_rows) > 0) {
        density_plot <- density_plot +
          geom_point(data = outlier_rows, y = 0, shape = 5, col = "blue")
      }
      density_plot
    }

    plot_list <- lapply(input$variables, plotQ28)
    grid.arrange(grobs = plot_list, top = "density plot of Variables")
  })
}
# Combine the UI and server definitions into the Shiny app
shinyApp(
  ui = Q28ui,
  server = Q28server
)
The curve becomes smoother when the bandwidth value increases, and
vice versa.
However, no single bandwidth value is optimal for all of the
variables, since the variables span different ranges. When
the bandwidth is increased, a variable that spans a larger range might get a
better and smoother plot, but a variable that spans a smaller range
will become too smooth and lose important information. For example,
setting the bandwidth to 10 for ‘Infection Risk’ demonstrates this
issue.
# Echo the source code of every chunk in the knitted report.
# Deliberately no rm(list = ls()) here: it would wipe the caller's workspace
# without resetting loaded packages or options.
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(gridExtra)
library(plotly)
library(shiny)
# Load the SENIC data set (the file has no header row) and name its columns
senic <- setNames(
  read.table(file = "SENIC.txt", header = FALSE),
  c(
    "ID",
    "Length of Stay",
    "Age",
    "Infection Risk",
    "Routine Culturing Ratio",
    "Routine Chest X-ray Ratio",
    "Number of Beds",
    "Medical School Affiliation",
    "Region",
    "Average Daily Census",
    "Number of Nurses",
    "Available Facilities & Services"
  )
)
# Locate outliers in a numeric vector using the 1.5 * IQR rule.
#
# input_vector: numeric vector. NA entries are ignored when the quartiles are
#   computed and are never reported as outliers.
# Returns: integer vector with the positions (in input_vector) of every value
#   above Q3 + 1.5 * IQR or below Q1 - 1.5 * IQR; integer(0) if there are none.
Q22_function <- function(input_vector) {
  # na.rm = TRUE: quantile() would otherwise stop on any missing value
  quartiles <- quantile(
    input_vector,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  threshold <- 1.5 * (quartiles[2] - quartiles[1])
  # which() drops NA comparisons, so missing values are never flagged
  which(
    input_vector > (quartiles[2] + threshold) |
      input_vector < (quartiles[1] - threshold)
  )
}
# Rows of senic whose infection risk is flagged by Q22_function
Infection_risk_outliers_indecies <- Q22_function(senic[["Infection Risk"]])
outlier_data <- senic[Infection_risk_outliers_indecies, ]

# Density curve of infection risk, with the flagged rows drawn in blue at y = 0
plot23 <- ggplot(data = senic) +
  geom_density(mapping = aes(x = `Infection Risk`)) +
  geom_point(
    data = outlier_data,
    mapping = aes(x = `Infection Risk`),
    y = 0,
    shape = 5,
    col = "blue"
  )
plot23
# Names of the quantitative columns of senic that are plotted below
quantitative_variables <- c(
  "Length of Stay",
  "Age",
  "Infection Risk",
  "Routine Culturing Ratio",
  "Routine Chest X-ray Ratio",
  "Number of Beds",
  "Average Daily Census",
  "Number of Nurses",
  "Available Facilities & Services"
)
# Build the density plot of one senic column, with the values flagged by
# Q22_function overlaid as blue points at y = 0.
#
# name: name of a column of senic (character scalar).
# Returns: a ggplot object.
#
# NOTE(review): this definition masks the base plot() generic for top-level
# code; the name is kept because the code below calls it.
plot <- function(name) {
  # Keep the flagged rows as a data frame so both layers map the same column
  outlier_rows <- senic[Q22_function(senic[[name]]), , drop = FALSE]

  # .data[[name]] selects the column by its string name inside aes()
  density_plot <- ggplot(data = senic, mapping = aes(x = .data[[name]])) +
    geom_density() +
    xlab(name)

  # Skip the point layer when nothing is flagged so the plot still works
  if (nrow(outlier_rows) > 0) {
    density_plot <- density_plot +
      geom_point(data = outlier_rows, y = 0, shape = 5, col = "blue")
  }
  density_plot
}
# One density plot per quantitative variable, arranged in a single grid
plot_list <- lapply(X = quantitative_variables, FUN = plot)
grid_plot <- grid.arrange(
  grobs = plot_list,
  top = "Density Plot of All Quantitative Variables"
)
# Scatter plot of number of nurses against infection risk, coloured by beds
plotQ2_5 <- ggplot(
  data = senic,
  mapping = aes(
    x = `Infection Risk`,
    y = `Number of Nurses`,
    color = `Number of Beds`
  )
) +
  geom_point()
print(plotQ2_5)
# Convert the ggplot2 density plot into an interactive plotly graph
ggplotly(p = plot23)
# Infection-risk values flagged by Q22_function
outliers <- senic[["Infection Risk"]][Q22_function(senic[["Infection Risk"]])]

# Histogram of infection risk, with the flagged values overlaid as diamond
# markers at y = 0
figure <- senic %>%
  mutate(ifoutliers = `Infection Risk` %in% outliers) %>%
  plot_ly(x = ~`Infection Risk`, type = "histogram") %>%
  add_trace(
    x = ~`Infection Risk`[ifoutliers],
    y = 0,
    type = "scatter",
    mode = "markers",
    marker = list(symbol = "diamond")
  ) %>%
  layout(title = "The histogram of Infection Risk and its outliers")
figure
# Shiny UI: variable check boxes, a bandwidth slider and the plot area
Q28ui <- fluidPage(
  checkboxGroupInput(
    inputId = "variables",
    label = "Choose Variables",
    choices = quantitative_variables
  ),
  sliderInput(
    inputId = "bw",
    label = "Choose bandwidth size",
    min = 0.02,
    max = 10.0,
    value = 0.2
  ),
  plotOutput(outputId = "densPlot")
)
# Shiny server: draws one density plot per selected variable, using the
# bandwidth chosen on the slider, with the values flagged by Q22_function
# overlaid as blue points at y = 0.
#
# input:  reads input$variables (selected column names) and input$bw.
# output: output$densPlot receives the rendered grid of plots.
Q28server <- function(input, output) {
  output$densPlot <- renderPlot({
    # Show a prompt instead of an error while no variable is selected
    validate(need(input$variables, "Please select variables."))
    # Read the slider once per render rather than once per plot
    bandwidth <- input$bw

    # Density plot of a single senic column with its flagged rows overlaid
    plotQ28 <- function(name) {
      outlier_rows <- senic[Q22_function(senic[[name]]), , drop = FALSE]

      # .data[[name]] selects the column by its string name inside aes()
      density_plot <- ggplot(data = senic, mapping = aes(x = .data[[name]])) +
        geom_density(bw = bandwidth) +
        xlab(name)

      # Skip the point layer when nothing is flagged
      if (nrow(outlier_rows) > 0) {
        density_plot <- density_plot +
          geom_point(data = outlier_rows, y = 0, shape = 5, col = "blue")
      }
      density_plot
    }

    plot_list <- lapply(input$variables, plotQ28)
    grid.arrange(grobs = plot_list, top = "density plot of Variables")
  })
}
# Combine the UI and server definitions into the Shiny app
shinyApp(
  ui = Q28ui,
  server = Q28server
)